# TASK 1 GETTING STARTED

# Smoke test: confirm the interpreter runs and can print a string.
print(x = "Hello, R!")
## [1] "Hello, R!"
# Report the R version, platform, locale, and loaded packages of this session.
sessionInfo()
## R version 4.4.3 (2025-02-28 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Karachi
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.37     R6_2.6.1          fastmap_1.2.0     xfun_0.51        
##  [5] cachem_1.1.0      knitr_1.50        htmltools_0.5.8.1 rmarkdown_2.29   
##  [9] lifecycle_1.0.4   cli_3.6.4         sass_0.4.9        jquerylib_0.1.4  
## [13] compiler_4.4.3    tools_4.4.3       evaluate_1.0.3    bslib_0.9.0      
## [17] yaml_2.3.10       rlang_1.1.5       jsonlite_2.0.0

# Load libraries
# Each library() call sits on its own line: several calls written on a single
# line without a `;` separator do not parse.

library(readxl)
library(dplyr)
library(ggplot2)
library(caret)


# TASK 2 Working with Data Imports


# Import the Iris data from a CSV file with the base R reader.
# header = TRUE takes the column names from the first row;
# stringsAsFactors = FALSE keeps text columns as character vectors.
data_csv <- read.csv(
  "F:/R_codes/csv_data_iris.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
head(data_csv)  # Display the first few rows of the dataset
##   sepal.length..cm. sepal.width..cm. petal.length..cm. petal.width..cm. target
## 1               5.1              3.5               1.4              0.2      0
## 2               4.9              3.0               1.4              0.2      0
## 3               4.7              3.2               1.3              0.2      0
## 4               4.6              3.1               1.5              0.2      0
## 5               5.0              3.6               1.4              0.2      0
## 6               5.4              3.9               1.7              0.4      0
##    class
## 1 setosa
## 2 setosa
## 3 setosa
## 4 setosa
## 5 setosa
## 6 setosa
# Import the Iris data from an Excel workbook with readxl.
library(readxl)
iris_data <- read_excel(path = file.path("F:/R_codes", "excel_data_iris.xlsx"))
head(iris_data)  # Preview the first rows of the imported sheet
## # A tibble: 6 × 6
##   `sepal length (cm)` `sepal width (cm)` `petal length (cm)` `petal width (cm)`
##                 <dbl>              <dbl>               <dbl>              <dbl>
## 1                 5.1                3.5                 1.4                0.2
## 2                 4.9                3                   1.4                0.2
## 3                 4.7                3.2                 1.3                0.2
## 4                 4.6                3.1                 1.5                0.2
## 5                 5                  3.6                 1.4                0.2
## 6                 5.4                3.9                 1.7                0.4
## # ℹ 2 more variables: target <dbl>, class <chr>

# TASK 3 DATA PREPROCESSING AND SUMMARY

# Drop every row that contains at least one missing value.
clean_iris <- na.omit(object = iris)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the rows whose sepal length exceeds 5.5.
filtered_iris <- filter(iris, Sepal.Length > 5.5)

# Mean sepal length per species: group first, then collapse each group
# to a single summary row.
summary_data <- summarise(
  group_by(iris, Species),
  mean_sepal_length = mean(Sepal.Length)
)
print(summary_data)
## # A tibble: 3 × 2
##   Species    mean_sepal_length
##   <fct>                  <dbl>
## 1 setosa                  5.01
## 2 versicolor              5.94
## 3 virginica               6.59

# TASK 4 DATA VISUALIZATION

# Bar chart: mean sepal length for each species
library(ggplot2)

ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  # stat = "summary" with fun = "mean" makes each bar the per-species mean
  geom_bar(stat = "summary", fun = "mean") +
  labs(title = "Average Sepal Length per Species")

# Scatter plot: sepal length against petal length, one colour per species
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Petal.Length, color = Species)
) +
  geom_point() +
  labs(title = "Scatter Plot of Sepal vs Petal Length")

#INTERACTIVE VISUALIZATION
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Interactive marker scatter of sepal length vs petal length,
# coloured by species.
plot_ly(
  data = iris,
  type = "scatter",
  mode = "markers",
  x = ~Sepal.Length,
  y = ~Petal.Length,
  color = ~Species
)

# TASK 5 ADVANCED ANALYSIS

# Linear regression: petal length modelled as a linear function of sepal length
model <- lm(formula = Petal.Length ~ Sepal.Length, data = iris)
summary(model)  # Coefficients, residual quantiles, and fit statistics
## 
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.47747 -0.59072 -0.00668  0.60484  2.49512 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.10144    0.50666  -14.02   <2e-16 ***
## Sepal.Length  1.85843    0.08586   21.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8678 on 148 degrees of freedom
## Multiple R-squared:   0.76,  Adjusted R-squared:  0.7583 
## F-statistic: 468.6 on 1 and 148 DF,  p-value: < 2.2e-16
# CLUSTERING
# k-means chooses its starting centres at random. Fix the RNG seed so the
# cluster labels are reproducible between runs, and use several random starts
# (nstart) so a single unlucky initialisation cannot decide the result.
set.seed(123)
clusters <- kmeans(iris[, 1:4], centers = 3, nstart = 25)
# Store the labels as a factor so they are treated as discrete groups.
iris$Cluster <- as.factor(clusters$cluster)

# Plot sepal length against petal length, colouring each point by the
# cluster label stored in `iris$Cluster`.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Petal.Length, color = Cluster)
) +
  geom_point()

# MACHINE LEARNING WITH CARET
library(caret)
## Loading required package: lattice
# Seed the RNG so the cross-validation resampling is reproducible.
set.seed(123)
# `Species ~ .` uses every other column as a predictor, so the k-means
# `Cluster` label added to `iris` during clustering is dropped here; setdiff()
# leaves the data untouched when that column is absent.
# NOTE(review): the results printed below were recorded with `Cluster` still
# included ("5 predictor"); re-run this chunk to refresh them.
# Note: this reuses the name `model`, replacing the linear regression fit.
model <- train(
  Species ~ .,
  data = iris[, setdiff(names(iris), "Cluster"), drop = FALSE],
  method = "rf",
  trControl = trainControl(method = "cv", number = 5)
)
print(model)
## Random Forest 
## 
## 150 samples
##   5 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 120, 120, 120, 120, 120 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa
##   2     0.9466667  0.92 
##   4     0.9466667  0.92 
##   6     0.9466667  0.92 
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

# Save Cleaned Data

# Write the NA-free copy produced by na.omit(). By this point `iris` itself has
# gained a `Cluster` column and is not the cleaned dataset.
write.csv(clean_iris, "cleaned_iris.csv", row.names = FALSE)

# Conclusion
#
# This report analyzed the Iris dataset, performed exploratory data analysis,
# visualized relationships, fitted a linear regression and k-means clustering,
# and implemented a Random Forest classification model.